2 Set up
First of all, start with the required libraries and functions.
library(tidyverse)
library(sf)
library(lubridate)
library(tigris)
library(tidycensus)
library(viridis)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(RSocrata)
library(ggplot2)
library(gifski)
library(gganimate)
library(caret)
plotTheme <- theme(
plot.title =element_text(size=12),
plot.subtitle = element_text(size=8),
plot.caption = element_text(size = 6),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10),
axis.title.y = element_text(size = 10),
# Set the entire chart region to blank
panel.background=element_blank(),
plot.background=element_blank(),
#panel.border=element_rect(colour="#F0F0F0"),
# Format the grid
panel.grid.major=element_line(colour="#D0D0D0",size=.2),
axis.ticks=element_blank())
mapTheme <- theme(plot.title =element_text(size=12),
plot.subtitle = element_text(size=8),
plot.caption = element_text(size = 6),
axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_line(colour = 'transparent'),
panel.grid.minor=element_blank(),
legend.direction = "vertical",
legend.position = "right",
plot.margin = margin(1, 1, 1, 1, 'cm'),
legend.key.height = unit(1, "cm"), legend.key.width = unit(0.2, "cm"))
palette5 <- c("#FB8C6F","#73607D","#C1B9AE","#FDC664","#355C7D")
palette4 <- c("#FB8C6F","#73607D","#C1B9AE","#FDC664")
palette2 <- c("#FB8C6F","#73607D")
2.1 Import bike share data
The LA Metro Bike Share trip data (source:https://bikeshare.metro.net/about/data/) for 2022 Q2 is
read in.
dat <- st_read("./metro-trips-2022-q2.csv")
station_name <- st_read("./metro-bike-share-stations-2022-10-01.csv")
colnames(station_name)[colnames(station_name) == 'Station_ID'] <- "start_station"
dat <-
merge(dat, station_name[c("Station_Name", "start_station")], by = "start_station")
# st_transform('ESRI:102286')
As below, parse_date_time function is used to
standardize the start_time field (when a trip departed)
into the 1 hour and 15 minute intervals needed for later analysis.
Function week and wday convert the data/time
stamp into week of the year and day of the week, respectively.
dat2 <- dat %>%
mutate(interval60 = floor_date(mdy_hm(start_time), unit = "hour"),
interval15 = floor_date(mdy_hm(start_time), unit = "15 mins"),
week = week(interval60),
dotw = wday(interval60, label=TRUE)) %>%
mutate(new_start_time = strptime(as.character(start_time), "%m/%d/%Y %H:%M"),
new_end_time = strptime(as.character(end_time), "%m/%d/%Y %H:%M")) %>%
filter(week>17 & week<23)
glimpse(dat2)
## Rows: 26,989
## Columns: 22
## $ start_station <chr> "3000", "3000", "3005", "3005", "3005", "3005", "3…
## $ trip_id <chr> "193851816", "193855710", "191014162", "194147504"…
## $ duration <chr> "4", "74", "4", "64", "1", "16", "16", "4", "84", …
## $ start_time <chr> "5/27/2022 14:59", "5/27/2022 15:04", "5/1/2022 18…
## $ end_time <chr> "5/27/2022 15:03", "5/27/2022 16:18", "5/1/2022 18…
## $ start_lat <chr> "", "", "34.0485", "34.0485", "34.0485", "34.0485"…
## $ start_lon <chr> "", "", "-118.258537", "-118.258537", "-118.258537…
## $ end_station <chr> "3000", "3000", "3031", "3005", "3005", "3030", "3…
## $ end_lat <chr> "", "", "34.044701", "34.0485", "34.0485", "34.051…
## $ end_lon <chr> "", "", "-118.252441", "-118.258537", "-118.258537…
## $ bike_id <chr> "6335", "6335", "20052", "14901", "12158", "5984",…
## $ plan_duration <chr> "30", "365", "30", "1", "30", "30", "30", "30", "3…
## $ trip_route_category <chr> "Round Trip", "Round Trip", "One Way", "Round Trip…
## $ passholder_type <chr> "Monthly Pass", "Annual Pass", "Monthly Pass", "On…
## $ bike_type <chr> "standard", "standard", "standard", "standard", "s…
## $ Station_Name <chr> "Virtual Station", "Virtual Station", "7th & Flowe…
## $ interval60 <dttm> 2022-05-27 14:00:00, 2022-05-27 15:00:00, 2022-05…
## $ interval15 <dttm> 2022-05-27 14:45:00, 2022-05-27 15:00:00, 2022-05…
## $ week <dbl> 21, 21, 18, 22, 20, 20, 20, 22, 20, 22, 20, 19, 21…
## $ dotw <ord> Fri, Fri, Sun, Mon, Sat, Mon, Thu, Wed, Tue, Tue, …
## $ new_start_time <dttm> 2022-05-27 14:59:00, 2022-05-27 15:04:00, 2022-05…
## $ new_end_time <dttm> 2022-05-27 15:03:00, 2022-05-27 16:18:00, 2022-05…
2.2 Import Census Info
The code block below pulls census geography and some variables from
the tidycensus package. These will be used to demonstrate
generalizability later, but won’t be used as independent variables
because they end up being perfectly colinear with the stations fixed
effects. Spatial information will be added to ridershare data as origin
and destination data.
LACensus <-
get_acs(geography = "tract",
variables = c("B01003_001", "B19013_001",
"B02001_002", "B08013_001",
"B08012_001", "B08301_001",
"B08301_010", "B01002_001"),
year = 2020,
state = "CA",
geometry = TRUE,
county=c("Los Angeles"),
output = "wide") %>%
rename(Total_Pop = B01003_001E,
Med_Inc = B19013_001E,
Med_Age = B01002_001E,
White_Pop = B02001_002E,
Travel_Time = B08013_001E,
Num_Commuters = B08012_001E,
Means_of_Transport = B08301_001E,
Total_Public_Trans = B08301_010E) %>%
select(Total_Pop, Med_Inc, White_Pop, Travel_Time,
Means_of_Transport, Total_Public_Trans,
Med_Age,
GEOID, geometry) %>%
mutate(Percent_White = White_Pop / Total_Pop,
Mean_Commute_Time = Travel_Time / Total_Public_Trans,
Percent_Taking_Public_Trans = Total_Public_Trans / Means_of_Transport)
LATracts <-
LACensus %>%
as.data.frame() %>%
distinct(GEOID, .keep_all = TRUE) %>%
select(GEOID, geometry) %>%
st_sf
dat2$start_lon[dat2$start_lon==''] <- NA
dat2$start_lat[dat2$start_lat==''] <- NA
dat2$end_lon[dat2$end_lon==''] <- NA
dat2$end_lat[dat2$end_lat==''] <- NA
dat_census <- st_join(dat2 %>%
filter(is.na(start_lon) == FALSE &
is.na(start_lat) == FALSE &
is.na(end_lat) == FALSE &
is.na(end_lon) == FALSE )%>%
st_as_sf(., coords = c("start_lon", "start_lat"), crs = 4326),
LATracts %>%
st_transform(crs=4326),
join=st_intersects,
left = TRUE) %>%
rename(Origin.Tract = GEOID) %>%
mutate(start_lon = unlist(map(geometry, 1)),
start_lan = unlist(map(geometry, 2)))%>%
as.data.frame() %>%
select(-geometry)%>%
st_as_sf(., coords = c("end_lon", "end_lat"), crs = 4326) %>%
st_join(., LATracts %>%
st_transform(crs=4326),
join=st_intersects,
left = TRUE) %>%
rename(Destination.Tract = GEOID) %>%
mutate(end_lon = unlist(map(geometry, 1)),
end_lat = unlist(map(geometry, 2)))%>%
as.data.frame() %>%
select(-geometry)
2.3 Import Weather Data
It would be reasonable to anticipate that bad weather in the Windy
City would encourage ride sharing. Here, I imported weather data from
Los Angeles Downtown/USC (SID code: CQT, serving period from 1999 till
now) using riem_measures between 2022-05-01 and
2022-05-31.
Below a weather.Panel is generated to summarize
temperature, precipitation and wind speed for every hour in May
2022.
weather.Panel <-
riem_measures(station = "CQT", date_start = "2022-04-30", date_end = "2022-06-03") %>%
dplyr::select(valid, tmpf, p01i, sknt)%>%
replace(is.na(.), 0) %>%
mutate(interval60 = ymd_h(substr(valid,1,13))) %>%
mutate(week = week(interval60),
dotw = wday(interval60, label=TRUE)) %>%
group_by(interval60) %>%
summarize(Temperature = max(tmpf),
Precipitation = sum(p01i),
Wind_Speed = max(sknt)) %>%
mutate(Temperature = ifelse(Temperature == 0, 42, Temperature))
glimpse(weather.Panel)
## Rows: 816
## Columns: 4
## $ interval60 <dttm> 2022-04-30 00:00:00, 2022-04-30 01:00:00, 2022-04-30 02…
## $ Temperature <dbl> 64.9, 62.1, 60.1, 59.0, 57.9, 57.9, 57.0, 57.0, 55.9, 55…
## $ Precipitation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Wind_Speed <dbl> 3, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,…
Below the weather data is plotted as a time series using
grid.arrange.
grid.arrange(
ggplot(weather.Panel, aes(interval60,Precipitation)) + geom_line() +
labs(title="Percipitation", x="Hour", y="Perecipitation") + plotTheme,
ggplot(weather.Panel, aes(interval60,Wind_Speed)) + geom_line() +
labs(title="Wind Speed", x="Hour", y="Wind Speed") + plotTheme,
ggplot(weather.Panel, aes(interval60,Temperature)) + geom_line() +
labs(title="Temperature", x="Hour", y="Temperature") + plotTheme,
top="Weather Data - LA CQT - May, 2022")

3 Describe and Explore the Data
3.1 Create Space-Time Panel
To make sure that each unique and hour/day combo exists in our data
set, an empty study.panel is created which has each unique
space/time observations. No matter an observation took place or not,
each time period in the study is represented by a row in it.
The maximum number of combinations can be calculated by 24 hours per
day * 7 days per week * 5 weeks * the number of stations. Then, compare
that to the number of rows in study.panel by
nrow. Both steps show the same result: 174528, which means
that the counts is correct.
Continue to join the station name, tract and latitude/longitude.
length(unique(dat_census$interval60)) * length(unique(dat_census$start_station))
## [1] 174528
study.panel <-
expand.grid(interval60=unique(dat_census$interval60),
start_station = unique(dat_census$start_station)) %>%
left_join(., dat_census %>%
select(start_station, Station_Name, Origin.Tract, start_lon, start_lan )%>%
distinct() %>%
group_by(start_station) %>%
slice(1))
nrow(study.panel)
## [1] 174528
In the code below, a full panel is summarizing by station for each
time interval, keep cansus info and latitude/longitude information along
for joining later to other data. Start station IDs are removed by
FALSE.
ride.panel <-
dat_census %>%
mutate(Trip_Counter = 1) %>%
right_join(study.panel) %>%
group_by(interval60, start_station, Station_Name, Origin.Tract, start_lon, start_lan) %>% # missing "from_station_name"
summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>%
left_join(weather.Panel) %>%
ungroup() %>%
filter(is.na(start_station) == FALSE) %>%
mutate(week = week(interval60),
dotw = wday(interval60, label = TRUE)) %>%
filter(is.na(Origin.Tract) == FALSE)
ride.panel <-
left_join(ride.panel, LACensus %>%
as.data.frame() %>%
select(-geometry), by = c("Origin.Tract" = "GEOID"))
3.2 Create time lags
ride.panel <-
ride.panel %>%
arrange(start_station, interval60) %>%
mutate(lagHour = dplyr::lag(Trip_Count,1),
lag2Hours = dplyr::lag(Trip_Count,2),
lag3Hours = dplyr::lag(Trip_Count,3),
lag4Hours = dplyr::lag(Trip_Count,4),
lag12Hours = dplyr::lag(Trip_Count,12),
lag1day = dplyr::lag(Trip_Count,24),
holiday = ifelse(yday(interval60) == 148,1,0)) %>%
mutate(day = yday(interval60)) %>%
mutate(holidayLag = case_when(dplyr::lag(holiday, 1) == 1 ~ "PlusOneDay",
dplyr::lag(holiday, 2) == 1 ~ "PlustTwoDays",
dplyr::lag(holiday, 3) == 1 ~ "PlustThreeDays",
dplyr::lead(holiday, 1) == 1 ~ "MinusOneDay",
dplyr::lead(holiday, 2) == 1 ~ "MinusTwoDays",
dplyr::lead(holiday, 3) == 1 ~ "MinusThreeDays"),
holidayLag = ifelse(is.na(holidayLag) == TRUE, 0, holidayLag))
as.data.frame(ride.panel) %>%
group_by(interval60) %>%
summarise_at(vars(starts_with("lag"), "Trip_Count"), mean, na.rm = TRUE) %>%
gather(Variable, Value, -interval60, -Trip_Count) %>%
mutate(Variable = factor(Variable, levels=c("lagHour","lag2Hours","lag3Hours","lag4Hours",
"lag12Hours","lag1day")))%>%
group_by(Variable) %>%
summarize(correlation = round(cor(Value, Trip_Count),2))
## # A tibble: 6 × 2
## Variable correlation
## <fct> <dbl>
## 1 lagHour 0.89
## 2 lag2Hours 0.76
## 3 lag3Hours 0.59
## 4 lag4Hours 0.39
## 5 lag12Hours -0.81
## 6 lag1day 0.83
3.3 Traning/testing data set
Here, a time series approach is taken, a training data set with 3
weeks of data (weeks 18-20) and a testing data set with 2 weeks of data
(weeks 21-22) are created. Code below split the data by
week.
ride.Train <- filter(ride.panel, week >= 18 & week <= 20)
ride.Test <- filter(ride.panel, week >=21 & week <= 22)
3.4 Exploratoring data
In this section, the rider share data is explored for time, space,
weather, and demographic relationships. If the Trip_Count
exhibit serial(temporal) correlation, the time lag features will lead to
better predictions.
3.4.1 Serial autocorrelation
We begin by examining the time and frequency components of our
data.
First, we look at the overall time pattern - there is clearly a daily
periodicity and there are lull periods on weekends.
geo-vline is used to visualize Mondays. All of
the five weeks exhibit similar time series patterns with consistent
peaks and troughs. Which suggests the presence of serial
correlation.
mondays <-
mutate(ride.panel,
monday = ifelse(dotw == "Mon" & hour(interval60) == 1,
interval60, 0)) %>%
filter(monday != 0)
st_drop_geometry(rbind(
mutate(ride.Train, Legend = "Training"),
mutate(ride.Test, Legend = "Testing"))) %>%
group_by(Legend, interval60) %>%
summarize(Trip_Count = sum(Trip_Count)) %>%
ungroup() %>%
ggplot(aes(interval60, Trip_Count, colour = Legend)) + geom_line() +
scale_colour_manual(values = palette2) +
geom_vline(data = mondays, aes(xintercept = monday)) +
labs(title="LA Rideshare trips by week: Apr 30 - Jun 03, 2022",
subtitle="Dotted lines for Mondays",
x="Day", y="Trip Count",
caption = "Figure 3-1") +
plotTheme + theme(panel.grid.major = element_blank())

Next, the time lag features are tested for correlation
with Trip_Count. plotData.lag returns the
Trip_Count and time lag features for week 18.
fct_relevel reorders the lag levels. Omit that line and try
levels(plotData.lag$Variable).
Pearson correlation is then calculated for each Variable
in correlation.lag.
Weak Trip_Count correlations are visualized below.
plotData.lag <-
filter(as.data.frame(ride.panel), week == 18) %>%
dplyr::select(starts_with("lag"), Trip_Count) %>%
gather(Variable, Value, -Trip_Count) %>%
mutate(Variable = fct_relevel(Variable, "lagHour","lag2Hours","lag3Hours",
"lag4Hours","lag12Hours","lag1day"))
correlation.lag <-
group_by(plotData.lag, Variable) %>%
summarize(correlation = round(cor(Value, Trip_Count, use = "complete.obs"), 2))
plotData.lag %>%
ggplot()+
geom_point(aes(x= Value, y = Trip_Count))+
geom_text(data = correlation.lag, aes(label = paste("r =", round(correlation, 2))),
x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
geom_smooth(aes(x= Value, y= Trip_Count), method = "lm", se = FALSE, color = "73607D")+
geom_abline(slope = 1, intercept = 0)+
facet_wrap(~Variable, ncol = 2, scales = "free") +
labs(title="Ridershare trip count as a function of time lags",
x="Lag Trip Count",
y="Trip_Count",
caption = "Figure 3-2")+
plotTheme

Figure 3-3 illustrates the distribution of trip volume by station for
different times of the day. There are some stations with high volume
periods, but more have low volume. Therefore, our data will consist of a
number of low demand station/hours and a few high demand station
hours.
We can also track the daily trends in ridership by day of the week
and weekend versus weekday, to see what temporal patterns we’d like to
control for.
dat_census %>%
mutate(time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
group_by(interval60, start_station, time_of_day) %>%
tally()%>%
group_by(start_station, time_of_day)%>%
summarize(mean_trips = mean(n))%>%
ggplot()+
geom_histogram(aes(mean_trips), binwidth = 1)+
labs(title="Mean Number of Hourly Trips Per Station. LA, Apr 30 - Jun 03, 2022",
x="Number of trips",
y="Frequency",
caption = "Figure 3-3")+
facet_wrap(~time_of_day)+
plotTheme

ggplot(dat_census %>%
group_by(interval60, start_station) %>%
tally())+
geom_histogram(aes(n), binwidth = 5)+
labs(title="Bike share trips per hr by station. LA, Apr 30 - Jun 03, 2022",
x="Trip Counts",
y="Number of Stations",
caption = "Figure 3-4")+
plotTheme

Figure 3-5 and 3-6 illustrate total Trip_Counts
distribution in one day by weekdays and weekends. It clear that the trip
volume of each day in a week have similar patterns: reaching peak at
about 400 in 12:00-17:00 and falling at near 0 value in 3:00 or 24:00.
Weekdays have almost double peak volume than weekends.
ggplot(dat_census %>% mutate(hour = hour(new_start_time)))+
geom_freqpoly(aes(hour, color = dotw), binwidth = 1)+
labs(title="Bike share trips in LA, by day of the week, May, 2022",
x="Hour",
y="Trip Counts",
caption = "Figure 3-5")+
plotTheme

ggplot(dat_census %>%
mutate(hour = hour(new_start_time),
weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday")))+
geom_freqpoly(aes(hour, color = weekend), binwidth = 1)+
labs(title="Bike share trips in LA - weekend vs weekday, Apr 30 - Jun 03, 2022",
x="Hour",
y="Trip Counts",
caption = "Figure 3-6")+
plotTheme

3.4.2 Spatial autocorrelation
In this part, the spatial autocorrelation will be examined. Figure
3-7 shows the Trip-Count per hour by station in weekdays
and weekends, respectively.
ggplot()+
geom_sf(data = LATracts %>%
st_transform(crs=4326))+
geom_point(data = dat_census %>%
mutate(hour = hour(new_start_time),
weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
group_by(start_station, start_lan, start_lon, weekend, time_of_day) %>%
tally(),
aes(x=start_lon, y = start_lan, color = n),
fill = "transparent", alpha = 0.5, size = 0.5)+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "D")+
ylim(min(dat_census$start_lan), max(dat_census$start_lan))+
xlim(min(dat_census$start_lon), max(dat_census$start_lon))+
facet_grid(weekend ~ time_of_day)+
labs(title="Bike share trips per hr by station. LA, May, 2022",
caption = "Figure 3-7")+
mapTheme

3.4.3 Space/time correlation
week20 <-
filter(ride.panel , week == 20 & dotw == "Mon")
week20.panel <-
expand.grid(
interval60 = unique(week20$interval60),
start_station = unique(ride.panel$start_station))
ride.animation.data <-
mutate(week20, Trip_Counter = 1) %>%
select(interval60, start_station, start_lon, start_lan, Trip_Counter) %>%
right_join(week20.panel) %>%
group_by(interval60, start_station, start_lon, start_lan,) %>%
summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>%
ungroup() %>%
mutate(Trips = case_when(Trip_Count == 0 ~ "0 trips",
Trip_Count > 0 & Trip_Count <= 2 ~ "1-2 trips",
Trip_Count > 2 & Trip_Count <= 5 ~ "3-5 trips",
Trip_Count > 5 & Trip_Count <= 10 ~ "6-10 trips",
Trip_Count > 10 & Trip_Count <= 15 ~ "11-15 trips",
Trip_Count > 15 ~ "16+ trips")) %>%
mutate(Trips = fct_relevel(Trips, "0 trips","1-2 trips","3-5 trips",
"6-10 trips","6-10 trips","16+ trips"))
rideshare_animation <-
ggplot() +
geom_sf(data = LATracts %>%
st_transform(crs=4326)) +
geom_point(data = ride.animation.data,
aes(x = start_lon, y = start_lan, fill = Trips), size = 0.5, alpha = 1.5)+
scale_color_manual(values = palette5) +
labs(title = "Rideshare pickups for one day in May 2022, LA",
subtitle = "60 minute intervals: {current_frame}",
caption = "Figure 3-8") +
transition_manual(interval60) +
mapTheme
animate(rideshare_animation, duration=75, renderer = gifski_renderer())
rideshare_animation <-
ggplot() +
geom_sf(data = ride.animation.data, aes(fill = Trips)) +
scale_fill_manual(values = palette5) +
labs(title = "Rideshare pickups for one day in November 2018",
subtitle = "15 minute intervals: {current_frame}") +
transition_manual(interval15) +
mapTheme
4 Modelling
4.1 Building model
In this section, five different linear regressions are estimated on
ride.Train, each with different fixed effects: 1.
reg1 focuses on just time, including hour fixed effects,
day of the week, and Temperature. 2. reg2
focuses on just space effects with the start_station fixed
effects. 3. reg3 includes both time and space fixed
effects. 4. reg4 adds the time lag features.
5. reg5 adds the holiday lag features.
reg1 <-
lm(Trip_Count ~ hour(interval60) + dotw + Temperature, data=ride.Train)
reg2 <-
lm(Trip_Count ~ start_station + dotw + Temperature, data=ride.Train)
reg3 <-
lm(Trip_Count ~ start_station + hour(interval60) + dotw + Temperature + Precipitation,
data=ride.Train)
reg4 <-
lm(Trip_Count ~ start_station + hour(interval60) + dotw + Temperature + Precipitation +
lagHour + lag2Hours +lag3Hours + lag12Hours + lag1day,
data=ride.Train)
reg5 <-
lm(Trip_Count ~ start_station + hour(interval60) + dotw + Temperature + Precipitation +
lagHour + lag2Hours +lag3Hours +lag12Hours + lag1day + holiday, # + holidayLag ,
data=ride.Train)
4.2 Predict for test data
Models built above will be used for predicting data in
ride.Test which includes 2 sf tibbles (with
geometries).
ride.Test.weekNest <-
ride.Test %>%
nest(-week)
ride.Test.weekNest
## # A tibble: 2 × 2
## week data
## <dbl> <list>
## 1 21 <tibble [35,096 × 30]>
## 2 22 <tibble [34,026 × 30]>
Next, a function is created that takes a tibble, dat and
a regression model, fit as its inputs, and outputs
predictions as pred. Each week in
ride.Trest.weekNest can be predicted by this function.
model_pred <- function(dat, fit){
pred <- predict(fit, newdata = dat)}
The code below will loop through each model for each testing week and
mutate summary statistics, including the predictions, observed, absolute
error, MAE and SD AE.
week_predictions <-
ride.Test.weekNest %>%
mutate(ATime_FE = map(.x = data, fit = reg1, .f = model_pred),
BSpace_FE = map(.x = data, fit = reg2, .f = model_pred),
CTime_Space_FE = map(.x = data, fit = reg3, .f = model_pred),
DTime_Space_FE_timeLags = map(.x = data, fit = reg4, .f = model_pred),
ETime_Space_FE_timeLags_holidayLags = map(.x = data, fit = reg5, .f = model_pred)) %>%
gather(Regression, Prediction, -data, -week) %>%
mutate(Observed = map(data, pull, Trip_Count),
Absolute_Error = map2(Observed, Prediction, ~ abs(.x - .y)),
MAE = map_dbl(Absolute_Error, mean, na.rm = TRUE),
sd_AE = map_dbl(Absolute_Error, sd, na.rm = TRUE))
week_predictions
A 2 (weeks) times 5 (models) table is created above, each row
presents the statistics of one week in ride.Trest.weekNest
predicted by a regression model.
5 Examing accuracy
In this section, Mean Absolute Error(MAE) is calculated on
ride.Test for each model.
5.1 Examine Error Metrics for Accuracy
Figure 5-1 shows goodness of fit by week and model. The MAE for the
time effects model (reg1) in week 21 is comparable to the
mean observed Trip_Count of 0.256. With increasing
sophistication, the model becomes more accurate and more
generalizable.
week_predictions %>%
dplyr::select(week, Regression, MAE) %>%
gather(Variable, MAE, -Regression, -week) %>%
ggplot(aes(week, MAE)) +
geom_bar(aes(fill = Regression), position = "dodge", stat="identity") +
scale_fill_manual(values = palette5) +
labs(title = "Mean Absolute Errors by model specification and week",
caption = "Figure 5-1") +
plotTheme

For each model, predicted and observed Trip_Count is
taken out of the spatial context and their means of plotted in time
series form below. It seems that model A and C have the same time trend
because the spatial context has been removed.
The time lags facilitate the capacity to predict for the highest
peaks as sophistication increases.
week_predictions %>%
mutate(interval60 = map(data, pull, interval60),
start_station = map(data, pull, start_station)) %>%
dplyr::select(interval60, start_station, Observed, Prediction, Regression) %>%
unnest() %>%
gather(Variable, Value, -Regression, -interval60, -start_station) %>%
group_by(Regression, Variable, interval60) %>%
summarize(Value = sum(Value)) %>%
ggplot(aes(interval60, Value, colour=Variable)) +
geom_line(size = 1.1) +
facet_wrap(~Regression, ncol=1) +
labs(title = "Predicted/Observed bike share time series", subtitle = "LA; A test set of 2 weeks", x = "Hour", y= "Station Trips",
caption = "Figure 5-2") +
plotTheme

Figure 5-3 illustrates the observed and predicted distribution for
different times of day durig weekdays and weekends. It is clear that the
models are under predicting in general.
week_predictions %>%
mutate(interval60 = map(data, pull, interval60),
start_station = map(data, pull, start_station),
start_lan = map(data, pull, start_lan),
start_lon = map(data, pull, start_lon),
dotw = map(data, pull, dotw)) %>%
select(interval60, start_lan, start_lon,
start_station, Observed, Prediction, Regression,
dotw) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
ggplot()+
geom_point(aes(x= Observed, y = Prediction))+
geom_smooth(aes(x= Observed, y= Prediction), method = "lm", se = FALSE, color = "red")+
geom_abline(slope = 1, intercept = 0)+
facet_grid(time_of_day~weekend)+
labs(title="Observed vs Predicted",
x="Observed trips",
y="Predicted trips",
caption = "Figure 5-3")+
plotTheme

5.2 Space-Time Error Evaluation
Figure 5-4 plots mean absolute errors on map by weekdays/weekends and
time of a day.
week_predictions %>%
mutate(interval60 = map(data, pull, interval60),
start_station = map(data, pull, start_station),
start_lan = map(data, pull, start_lan),
start_lon = map(data, pull, start_lon),
dotw = map(data, pull, dotw) ) %>%
select(interval60, start_station, start_lon,
start_lan, Observed, Prediction, Regression,
dotw) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
group_by(start_station, weekend, time_of_day, start_lon, start_lan) %>%
summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
ggplot(.)+
geom_sf(data = LACensus, color = "grey", fill = "transparent")+
geom_point(aes(x = start_lon, y = start_lan, color = MAE),
fill = "transparent", size = 0.5, alpha = 0.4)+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "D")+
ylim(min(dat_census$start_lan), max(dat_census$start_lan))+
xlim(min(dat_census$start_lon), max(dat_census$start_lon))+
facet_grid(weekend~time_of_day)+
labs(title="Mean Absolute Errors, Test Set",
caption = "Figure 5-4")+
mapTheme

5.3 Socio-economic variables
One might think that the station locations probably relate to likely
users, who seem to be commuting downtown to the loop. Figure 5-4 plots
MAE by three socio-economic variables: median income, percent taking
public transportation and percent of white. Several stations show
resistance to the models - they have high income, low public
transportation usage and more than a half of white population.
week_predictions %>%
mutate(interval60 = map(data, pull, interval60),
start_station = map(data, pull, start_station),
start_lan = map(data, pull, start_lan),
start_lon = map(data, pull, start_lon),
dotw = map(data, pull, dotw),
Percent_Taking_Public_Trans = map(data, pull, Percent_Taking_Public_Trans),
Med_Inc = map(data, pull, Med_Inc),
Percent_White = map(data, pull, Percent_White)) %>%
select(interval60, start_station, start_lon,
start_lan, Observed, Prediction, Regression,
dotw, Percent_Taking_Public_Trans, Med_Inc, Percent_White) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
filter(time_of_day == "AM Rush") %>%
group_by(start_station, Percent_Taking_Public_Trans, Med_Inc, Percent_White) %>%
summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
gather(-start_station, -MAE, key = "variable", value = "value")%>%
ggplot(.)+
#geom_sf(data = LACensus, color = "grey", fill = "transparent")+
geom_point(aes(x = value, y = MAE), alpha = 0.4)+
geom_smooth(aes(x = value, y = MAE), method = "lm", se= FALSE)+
facet_wrap(~variable, scales = "free")+
labs(title="Errors as a function of socio-economic variables",
y="Mean Absolute Error (Trips)",
caption = "Figure 5-4")+
plotTheme

6 Cross-Validation
Below, a parameter called fitControl is set to specify
the number of k-fold prtitions - in this case 100. In the code below,
set.seed ensures reproducible folds. An object
reg.cv, is estimated using the same regression as specified
in reg 4.
fitControl <- trainControl(method = "cv", number = 100)
set.seed(8888)
reg.cv <-
train(Trip_Count ~ start_station + hour(interval60) + dotw + Temperature + Precipitation +
lagHour + lag2Hours +lag3Hours + lag12Hours + lag1day,
data=ride.panel,
method = "lm", trControl = fitControl, na.action = na.pass)
reg.cv
## Linear Regression
##
## 172912 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (100 fold)
## Summary of sample sizes: 171183, 171183, 171183, 171183, 171183, 171183, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.497709 0.2339435 0.2295348
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
The cross-validation output provides very important goodness of fit
information. The resample shows the first five outputs of
100 folds.
reg.cv$resample[1:5,]
## RMSE Rsquared MAE Resample
## 1 0.4378364 0.24580020 0.2218254 Fold001
## 2 0.4857485 0.34633998 0.2280337 Fold002
## 3 0.4930859 0.20575242 0.2330104 Fold003
## 4 0.4642082 0.17568579 0.2326678 Fold004
## 5 0.4936677 0.06945884 0.2304217 Fold005
MAE <- reg.cv$resample[,3]
df <- data.frame(MAE)
ggplot(df, aes(x=MAE)) +
geom_histogram(aes(y=..density..),bins=35, fill="#FB8C6F") +
labs(
x="Cross-Validation MAE Histogram",
y="MAE",
caption = "Figure 6-1"
)

plotTheme
---
title: "Bike share"
author: "Yingxue Ou"
date: "2022-11-11"
output: 
  html_document:
    toc: true
    toc_float: true
    code_folding: "hide"
    code_download: true
---

# 1 Introduction

The requirement to "re-balance" bicycles throughout the network is one of the difficult operational issues faced by urban bike share systems. If there are no bikes available for pickup at a dock or available docking spots to leave a bike, bike share is useless. Re-balancing is the activity of physically redistributing bikes to ensure that a bike or a docking location is accessible when needed while anticipating (or projecting) bike sharing demand for all docks at all times.

Rebalancing can be achieved by multiple ways, transporting excess bikes to where they are needed by trucks might be one of the most operational and efficient ways. Of course, it is also a feasible way to encourage cyclists to park their bicycles in designated locations through appropriate incentives. All in all, almost all rebalancing methods are highly dependent on accurate analysis and prediction of the spatial and time distribution of bikes. In this project, the LA Metro Bike Share trip data (source:https://bikeshare.metro.net/about/data/) for 2022 Q2 will be used to build models and forecast space/time demand for bike share usages. 

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# 2 Set up

First of all, start with the required libraries and functions. 

```{r setup_12, cache=TRUE, message=FALSE}
library(tidyverse)
library(sf)
library(lubridate)
library(tigris)
library(tidycensus)
library(viridis)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(RSocrata)
library(ggplot2)
library(gifski)
library(gganimate)
library(caret)
plotTheme <- theme(
  plot.title =element_text(size=12),
  plot.subtitle = element_text(size=8),
  plot.caption = element_text(size = 6),
  axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
  axis.text.y = element_text(size = 10),
  axis.title.y = element_text(size = 10),
  # Set the entire chart region to blank
  panel.background=element_blank(),
  plot.background=element_blank(),
  #panel.border=element_rect(colour="#F0F0F0"),
  # Format the grid
  panel.grid.major=element_line(colour="#D0D0D0",size=.2),
  axis.ticks=element_blank())
mapTheme <- theme(plot.title =element_text(size=12),
                  plot.subtitle = element_text(size=8),
                  plot.caption = element_text(size = 6),
                  axis.line=element_blank(),
                  axis.text.x=element_blank(),
                  axis.text.y=element_blank(),
                  axis.ticks=element_blank(),
                  axis.title.x=element_blank(),
                  axis.title.y=element_blank(),
                  panel.background=element_blank(),
                  panel.border=element_blank(),
                  panel.grid.major=element_line(colour = 'transparent'),
                  panel.grid.minor=element_blank(),
                  legend.direction = "vertical", 
                  legend.position = "right",
                  plot.margin = margin(1, 1, 1, 1, 'cm'),
                  legend.key.height = unit(1, "cm"), legend.key.width = unit(0.2, "cm"))
palette5 <- c("#FB8C6F","#73607D","#C1B9AE","#FDC664","#355C7D")
palette4 <- c("#FB8C6F","#73607D","#C1B9AE","#FDC664")
palette2 <- c("#FB8C6F","#73607D")
```

```{r install_census_API_key, warning = FALSE, include=FALSE, eval = TRUE}
# Install Census API Key
tidycensus::census_api_key("a1244963c219132f323b6fef75d30ba4d939d1c5", overwrite = TRUE)
```

## 2.1 Import bike share data

The LA Metro Bike Share trip data (source:https://bikeshare.metro.net/about/data/) for 2022 Q2 is read in.

```{r read_dat , results='hide', message=FALSE , warning=FALSE}
dat <- st_read("./metro-trips-2022-q2.csv") 
station_name <- st_read("./metro-bike-share-stations-2022-10-01.csv")

colnames(station_name)[colnames(station_name) == 'Station_ID'] <- "start_station"

dat <-
  merge(dat, station_name[c("Station_Name", "start_station")], by = "start_station")
#  st_transform('ESRI:102286')
```

As below, `parse_date_time` function is used to standardize the `start_time` field (when a trip departed) into the 1 hour and 15 minute intervals needed for later analysis. Function `week` and `wday` convert the data/time stamp into week of the year and day of the week, respectively.

```{r time_bins }
dat2 <- dat %>%
  mutate(interval60 = floor_date(mdy_hm(start_time), unit = "hour"),
         interval15 = floor_date(mdy_hm(start_time), unit = "15 mins"),
         week = week(interval60),
         dotw = wday(interval60, label=TRUE)) %>%
  mutate(new_start_time = strptime(as.character(start_time), "%m/%d/%Y %H:%M"),
         new_end_time = strptime(as.character(end_time), "%m/%d/%Y %H:%M")) %>%
  filter(week>17 & week<23)
glimpse(dat2)
```

## 2.2 Import Census Info

The code block below pulls census geography and some variables from the `tidycensus` package. These will be used to demonstrate generalizability later, but won't be used as independent variables because they end up being perfectly colinear with the stations fixed effects. Spatial information will be added to ridershare data as origin and destination data.

```{r get_census, message=FALSE, warning=FALSE, cache=TRUE, results = 'hide'}
LACensus <- 
  get_acs(geography = "tract", 
          variables = c("B01003_001", "B19013_001", 
                        "B02001_002", "B08013_001",
                        "B08012_001", "B08301_001", 
                        "B08301_010", "B01002_001"), 
          year = 2020, 
          state = "CA", 
          geometry = TRUE, 
          county=c("Los Angeles"),
          output = "wide") %>%
  rename(Total_Pop =  B01003_001E,
         Med_Inc = B19013_001E,
         Med_Age = B01002_001E,
         White_Pop = B02001_002E,
         Travel_Time = B08013_001E,
         Num_Commuters = B08012_001E,
         Means_of_Transport = B08301_001E,
         Total_Public_Trans = B08301_010E) %>%
  select(Total_Pop, Med_Inc, White_Pop, Travel_Time,
         Means_of_Transport, Total_Public_Trans,
         Med_Age,
         GEOID, geometry) %>%
  mutate(Percent_White = White_Pop / Total_Pop,
         Mean_Commute_Time = Travel_Time / Total_Public_Trans,
         Percent_Taking_Public_Trans = Total_Public_Trans / Means_of_Transport)
```

```{r extract_geometries }
LATracts <- 
  LACensus %>%
  as.data.frame() %>%
  distinct(GEOID, .keep_all = TRUE) %>%
  select(GEOID, geometry) %>% 
  st_sf
```

```{r add_census_tracts , message = FALSE, warning = FALSE}

dat2$start_lon[dat2$start_lon==''] <- NA
dat2$start_lat[dat2$start_lat==''] <- NA
dat2$end_lon[dat2$end_lon==''] <- NA
dat2$end_lat[dat2$end_lat==''] <- NA

dat_census <-  st_join(dat2 %>% 
          filter(is.na(start_lon) == FALSE &
                   is.na(start_lat) == FALSE &
                   is.na(end_lat) == FALSE &
                   is.na(end_lon) == FALSE )%>%
          st_as_sf(., coords = c("start_lon", "start_lat"), crs = 4326),
        LATracts %>%
          st_transform(crs=4326),
        join=st_intersects,
              left = TRUE) %>%
  rename(Origin.Tract = GEOID) %>%
  mutate(start_lon = unlist(map(geometry, 1)),
         start_lan = unlist(map(geometry, 2)))%>%
  as.data.frame() %>%
  select(-geometry)%>%
  st_as_sf(., coords = c("end_lon", "end_lat"), crs = 4326) %>%
  st_join(., LATracts %>%
            st_transform(crs=4326),
          join=st_intersects,
          left = TRUE) %>%
  rename(Destination.Tract = GEOID)  %>%
  mutate(end_lon = unlist(map(geometry, 1)),
         end_lat = unlist(map(geometry, 2)))%>%
  as.data.frame() %>%
  select(-geometry)
```

## 2.3 Import Weather Data

It would be reasonable to anticipate that bad weather in the Windy City would encourage ride sharing. Here, I imported weather data from Los Angeles Downtown/USC (SID code: CQT, serving period from 1999 till now) using `riem_measures` between 2022-05-01 and 2022-05-31.

Below a `weather.Panel` is generated to summarize temperature, precipitation and wind speed for every hour in May 2022.

```{r weather}
weather.Panel <- 
  riem_measures(station = "CQT", date_start = "2022-04-30", date_end = "2022-06-03") %>%
  dplyr::select(valid, tmpf, p01i, sknt)%>%
  replace(is.na(.), 0) %>%
    mutate(interval60 = ymd_h(substr(valid,1,13))) %>%
    mutate(week = week(interval60),
           dotw = wday(interval60, label=TRUE)) %>%
    group_by(interval60) %>%
    summarize(Temperature = max(tmpf),
              Precipitation = sum(p01i),
              Wind_Speed = max(sknt)) %>%
    mutate(Temperature = ifelse(Temperature == 0, 42, Temperature))

glimpse(weather.Panel)
```

Below the weather data is plotted as a time series using `grid.arrange`.

```{r plot_weather, catche = TRUE}
grid.arrange(
  ggplot(weather.Panel, aes(interval60,Precipitation)) + geom_line() + 
  labs(title="Percipitation", x="Hour", y="Perecipitation") + plotTheme,
  ggplot(weather.Panel, aes(interval60,Wind_Speed)) + geom_line() + 
    labs(title="Wind Speed", x="Hour", y="Wind Speed") + plotTheme,
  ggplot(weather.Panel, aes(interval60,Temperature)) + geom_line() + 
    labs(title="Temperature", x="Hour", y="Temperature") + plotTheme,
  top="Weather Data - LA CQT - May, 2022")
```

# 3 Describe and Explore the Data

## 3.1 Create Space-Time Panel

To make sure that each unique and hour/day combo exists in our data set, an empty `study.panel` is created which has each unique space/time observations. No matter an observation took place or not, each time period in the study is represented by a row in it. 

The maximum number of combinations can be calculated by 24 hours per day * 7 days per week * 5 weeks * the number of stations. Then, compare that to the number of rows in `study.panel` by `nrow`. Both steps show the same result: 174528, which means that the counts is correct.

Continue to join the station name, tract and latitude/longitude.

```{r panel_length_check , message = FALSE, warning = FALSE}
length(unique(dat_census$interval60)) * length(unique(dat_census$start_station))

study.panel <- 
  expand.grid(interval60=unique(dat_census$interval60), 
              start_station = unique(dat_census$start_station)) %>%
  left_join(., dat_census %>%
              select(start_station, Station_Name, Origin.Tract, start_lon, start_lan )%>% 
              distinct() %>%
              group_by(start_station) %>%
              slice(1))
nrow(study.panel)      
```

In the code below, a full panel is summarizing by station for each time interval, keep cansus info and latitude/longitude information along for joining later to other data. Start station IDs are removed by `FALSE`.

```{r create_panel , message = FALSE}
ride.panel <- 
  dat_census %>%
  mutate(Trip_Counter = 1) %>%
  right_join(study.panel) %>% 
  group_by(interval60, start_station, Station_Name, Origin.Tract, start_lon, start_lan) %>% # missing "from_station_name"
  summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>%
  left_join(weather.Panel) %>%
  ungroup() %>%
  filter(is.na(start_station) == FALSE) %>%
  mutate(week = week(interval60),
         dotw = wday(interval60, label = TRUE)) %>%
  filter(is.na(Origin.Tract) == FALSE)
```

```{r census_and_panel , message = FALSE}
ride.panel <- 
  left_join(ride.panel, LACensus %>%
              as.data.frame() %>%
              select(-geometry), by = c("Origin.Tract" = "GEOID"))
```

## 3.2 Create time lags

```{r time_lags , message = FALSE}
ride.panel <- 
  ride.panel %>% 
  arrange(start_station, interval60) %>% 
  mutate(lagHour = dplyr::lag(Trip_Count,1),
         lag2Hours = dplyr::lag(Trip_Count,2),
         lag3Hours = dplyr::lag(Trip_Count,3),
         lag4Hours = dplyr::lag(Trip_Count,4),
         lag12Hours = dplyr::lag(Trip_Count,12),
         lag1day = dplyr::lag(Trip_Count,24),
         holiday = ifelse(yday(interval60) == 148,1,0)) %>%
   mutate(day = yday(interval60)) %>%
   mutate(holidayLag = case_when(dplyr::lag(holiday, 1) == 1 ~ "PlusOneDay",
                                 dplyr::lag(holiday, 2) == 1 ~ "PlustTwoDays",
                                 dplyr::lag(holiday, 3) == 1 ~ "PlustThreeDays",
                                 dplyr::lead(holiday, 1) == 1 ~ "MinusOneDay",
                                 dplyr::lead(holiday, 2) == 1 ~ "MinusTwoDays",
                                 dplyr::lead(holiday, 3) == 1 ~ "MinusThreeDays"),
         holidayLag = ifelse(is.na(holidayLag) == TRUE, 0, holidayLag))
```

```{r evaluate_lags , warning = FALSE, message = FALSE}
as.data.frame(ride.panel) %>%
    group_by(interval60) %>% 
    summarise_at(vars(starts_with("lag"), "Trip_Count"), mean, na.rm = TRUE) %>%
    gather(Variable, Value, -interval60, -Trip_Count) %>%
    mutate(Variable = factor(Variable, levels=c("lagHour","lag2Hours","lag3Hours","lag4Hours",
                                                "lag12Hours","lag1day")))%>%
    group_by(Variable) %>%  
    summarize(correlation = round(cor(Value, Trip_Count),2))
```

## 3.3 Traning/testing data set

Here, a time series approach is taken, a training data set with 3 weeks of data (weeks 18-20) and a testing data set with 2 weeks of data (weeks 21-22) are created. Code below split the data by `week`.

```{r train_test }
ride.Train <- filter(ride.panel, week >= 18 & week <= 20)
ride.Test <- filter(ride.panel, week >=21 & week <= 22)
```

## 3.4 Exploratoring data

In this section, the rider share data is explored for time, space, weather, and demographic relationships. If the `Trip_Count` exhibit serial(temporal) correlation, the time lag features will lead to better predictions.

### 3.4.1 Serial autocorrelation

We begin by examining the time and frequency components of our data.

First, we look at the overall time pattern - there is clearly a daily periodicity and there are lull periods on weekends. `geo-vline` is used to visualize `Mondays`. All of the five weeks exhibit similar time series patterns with consistent peaks and troughs. Which suggests the presence of serial correlation.

```{r trip_timeseries,results='hide', message=FALSE , warning=FALSE}
mondays <- 
  mutate(ride.panel,
         monday = ifelse(dotw == "Mon" & hour(interval60) == 1,
                         interval60, 0)) %>%
  filter(monday != 0) 

st_drop_geometry(rbind(
  mutate(ride.Train, Legend = "Training"), 
  mutate(ride.Test, Legend = "Testing"))) %>%
    group_by(Legend, interval60) %>% 
      summarize(Trip_Count = sum(Trip_Count)) %>%
      ungroup() %>% 
      ggplot(aes(interval60, Trip_Count, colour = Legend)) + geom_line() +
        scale_colour_manual(values = palette2) +
        geom_vline(data = mondays, aes(xintercept = monday)) +
        labs(title="LA Rideshare trips by week: Apr 30 - Jun 03, 2022",
             subtitle="Dotted lines for Mondays", 
             x="Day", y="Trip Count",
             caption = "Figure 3-1") +
        plotTheme + theme(panel.grid.major = element_blank())    
```

Next, the time `lag` features are tested for correlation with `Trip_Count`. `plotData.lag` returns the `Trip_Count` and time lag features for week 18. `fct_relevel` reorders the lag levels. Omit that line and try `levels(plotData.lag$Variable)`.

Pearson correlation is then calculated for each `Variable` in `correlation.lag`.

Weak `Trip_Count` correlations are visualized below.

```{r time_lag, results='hide', message=FALSE , warning=FALSE}

plotData.lag <-
  filter(as.data.frame(ride.panel), week == 18) %>%
  dplyr::select(starts_with("lag"), Trip_Count) %>%
  gather(Variable, Value, -Trip_Count) %>%
  mutate(Variable = fct_relevel(Variable, "lagHour","lag2Hours","lag3Hours",
                                          "lag4Hours","lag12Hours","lag1day"))
correlation.lag <-
  group_by(plotData.lag, Variable) %>%
    summarize(correlation = round(cor(Value, Trip_Count, use = "complete.obs"), 2)) 

plotData.lag %>%
ggplot()+
  geom_point(aes(x= Value, y = Trip_Count))+
  geom_text(data = correlation.lag, aes(label = paste("r =", round(correlation, 2))),
            x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
  geom_smooth(aes(x= Value, y= Trip_Count), method = "lm", se = FALSE, color = "73607D")+
  geom_abline(slope = 1, intercept = 0)+
  facet_wrap(~Variable, ncol = 2, scales = "free") +
  labs(title="Ridershare trip count as a function of time lags",
       x="Lag Trip Count", 
       y="Trip_Count",
       caption = "Figure 3-2")+
  plotTheme
```

Figure 3-3 illustrates the distribution of trip volume by station for different times of the day. There are some stations with high volume periods, but more have low volume. Therefore, our data will consist of a number of low demand station/hours and a few high demand station hours.

We can also track the daily trends in ridership by day of the week and weekend versus weekday, to see what temporal patterns we'd like to control for.

```{r mean_trips_hist, warning = FALSE, message = FALSE }
dat_census %>%
        mutate(time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
         group_by(interval60, start_station, time_of_day) %>%
         tally()%>%
  group_by(start_station, time_of_day)%>%
  summarize(mean_trips = mean(n))%>%
  ggplot()+
  geom_histogram(aes(mean_trips), binwidth = 1)+
  labs(title="Mean Number of Hourly Trips Per Station. LA, Apr 30 - Jun 03, 2022",
       x="Number of trips", 
       y="Frequency",
       caption = "Figure 3-3")+
  facet_wrap(~time_of_day)+
  plotTheme
```

```{r trips_station_dotw }
ggplot(dat_census %>%
         group_by(interval60, start_station) %>%
         tally())+
  geom_histogram(aes(n), binwidth = 5)+
  labs(title="Bike share trips per hr by station. LA, Apr 30 - Jun 03, 2022",
       x="Trip Counts", 
       y="Number of Stations",
       caption = "Figure 3-4")+
  plotTheme
```

Figure 3-5 and 3-6 illustrate total `Trip_Counts` distribution in one day by weekdays and  weekends. It clear that the trip volume of each day in a week have similar patterns: reaching peak at about 400 in 12:00-17:00 and falling at near 0 value in 3:00 or 24:00. Weekdays have almost double peak volume than weekends.

```{r trips_hour_dotw }
ggplot(dat_census %>% mutate(hour = hour(new_start_time)))+
     geom_freqpoly(aes(hour, color = dotw), binwidth = 1)+
  labs(title="Bike share trips in LA, by day of the week, May, 2022",
       x="Hour", 
       y="Trip Counts",
       caption = "Figure 3-5")+
     plotTheme
ggplot(dat_census %>% 
         mutate(hour = hour(new_start_time),
                weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday")))+
     geom_freqpoly(aes(hour, color = weekend), binwidth = 1)+
  labs(title="Bike share trips in LA - weekend vs weekday, Apr 30 - Jun 03, 2022",
       x="Hour", 
       y="Trip Counts",
       caption = "Figure 3-6")+
     plotTheme
```

### 3.4.2 Spatial autocorrelation

In this part, the spatial autocorrelation will be examined. Figure 3-7 shows the `Trip-Count` per hour by station in weekdays and weekends, respectively.

```{r origin_map }
ggplot()+
  geom_sf(data = LATracts %>%
          st_transform(crs=4326))+
  geom_point(data = dat_census %>% 
            mutate(hour = hour(new_start_time),
                weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
                time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
              group_by(start_station, start_lan, start_lon, weekend, time_of_day) %>%
              tally(),
            aes(x=start_lon, y = start_lan, color = n), 
            fill = "transparent", alpha = 0.5, size = 0.5)+
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "D")+
  ylim(min(dat_census$start_lan), max(dat_census$start_lan))+
  xlim(min(dat_census$start_lon), max(dat_census$start_lon))+
  facet_grid(weekend ~ time_of_day)+
  labs(title="Bike share trips per hr by station. LA, May, 2022",
       caption = "Figure 3-7")+
  mapTheme
```

### 3.4.3 Space/time correlation

```{r week_panel}
week20 <-
  filter(ride.panel , week == 20 & dotw == "Mon")

week20.panel <-
  expand.grid(
    interval60 = unique(week20$interval60),
    start_station = unique(ride.panel$start_station))
```

```{r  results='hide', message=FALSE , warning=FALSE}
ride.animation.data <-
  mutate(week20, Trip_Counter = 1) %>%
  select(interval60, start_station, start_lon, start_lan, Trip_Counter) %>%
    right_join(week20.panel) %>% 
    group_by(interval60, start_station, start_lon, start_lan,) %>%
    summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>% 
    ungroup() %>%
    mutate(Trips = case_when(Trip_Count == 0 ~ "0 trips",
                             Trip_Count > 0 & Trip_Count <= 2 ~ "1-2 trips",
                             Trip_Count > 2 & Trip_Count <= 5 ~ "3-5 trips",
                             Trip_Count > 5 & Trip_Count <= 10 ~ "6-10 trips",
                             Trip_Count > 10 & Trip_Count <= 15 ~ "11-15 trips",
                             Trip_Count > 15 ~ "16+ trips")) %>%
    mutate(Trips  = fct_relevel(Trips, "0 trips","1-2 trips","3-5 trips",
                                       "6-10 trips","6-10 trips","16+ trips"))
```

```{r  results='hide', message=FALSE , warning=FALSE}
rideshare_animation <-
  ggplot() +
    geom_sf(data = LATracts %>%
              st_transform(crs=4326)) +
    geom_point(data = ride.animation.data, 
               aes(x = start_lon, y = start_lan, fill = Trips), size = 0.5, alpha = 1.5)+
    scale_color_manual(values = palette5) +
    labs(title = "Rideshare pickups for one day in May 2022, LA",
         subtitle = "60 minute intervals: {current_frame}",
         caption = "Figure 3-8") +
    transition_manual(interval60) +
    mapTheme

animate(rideshare_animation, duration=75, renderer = gifski_renderer())
```
```{r}
rideshare_animation <-
  ggplot() +
    geom_sf(data = ride.animation.data, aes(fill = Trips)) +
    scale_fill_manual(values = palette5) +
    labs(title = "Rideshare pickups for one day in November 2018",
         subtitle = "15 minute intervals: {current_frame}") +
    transition_manual(interval15) +
    mapTheme
```

# 4 Modelling

## 4.1 Building model

In this section, five different linear regressions are estimated on `ride.Train`, each with different fixed effects:
1. `reg1` focuses on just time, including hour fixed effects, day of the week, and `Temperature`.
2. `reg2` focuses on just space effects with the `start_station` fixed effects.
3. `reg3` includes both time and space fixed effects.
4. `reg4` adds the time `lag` features.
5. `reg5` adds the holiday `lag` features.

```{r five_models }
reg1 <- 
  lm(Trip_Count ~  hour(interval60) + dotw + Temperature,  data=ride.Train)
reg2 <- 
  lm(Trip_Count ~  start_station + dotw + Temperature,  data=ride.Train)
reg3 <- 
  lm(Trip_Count ~  start_station + hour(interval60) + dotw + Temperature + Precipitation, 
     data=ride.Train)
reg4 <- 
  lm(Trip_Count ~  start_station +  hour(interval60) + dotw + Temperature + Precipitation +
                   lagHour + lag2Hours +lag3Hours + lag12Hours + lag1day, 
     data=ride.Train)
reg5 <- 
  lm(Trip_Count ~  start_station + hour(interval60) + dotw + Temperature + Precipitation +
                   lagHour + lag2Hours +lag3Hours +lag12Hours + lag1day + holiday, # + holidayLag ,
     data=ride.Train)
```

## 4.2 Predict for test data

Models built above will be used for predicting data in `ride.Test` which includes 2 sf tibbles (with geometries). 

```{r nest_data , warning = FALSE, message = FALSE}
ride.Test.weekNest <- 
  ride.Test %>%
  nest(-week) 

ride.Test.weekNest
```

Next, a function is created that takes a tibble, `dat` and a regression model, `fit` as its inputs, and outputs predictions as `pred`. Each week in `ride.Trest.weekNest` can be predicted by this function.

```{r predict_function }
model_pred <- function(dat, fit){
   pred <- predict(fit, newdata = dat)}
```

The code below will loop through each model for each testing week and mutate summary statistics, including the predictions, observed, absolute error, MAE and SD AE.

```{r do_predicitons, results='hide', message=FALSE , warning=FALSE }
week_predictions <- 
  ride.Test.weekNest %>% 
    mutate(ATime_FE = map(.x = data, fit = reg1, .f = model_pred),
           BSpace_FE = map(.x = data, fit = reg2, .f = model_pred),
           CTime_Space_FE = map(.x = data, fit = reg3, .f = model_pred),
           DTime_Space_FE_timeLags = map(.x = data, fit = reg4, .f = model_pred),
           ETime_Space_FE_timeLags_holidayLags = map(.x = data, fit = reg5, .f = model_pred)) %>% 
  
    gather(Regression, Prediction, -data, -week) %>%
    mutate(Observed = map(data, pull, Trip_Count),
           Absolute_Error = map2(Observed, Prediction,  ~ abs(.x - .y)),
           MAE = map_dbl(Absolute_Error, mean, na.rm = TRUE),
           sd_AE = map_dbl(Absolute_Error, sd, na.rm = TRUE))
week_predictions
```

A 2 (weeks) times 5 (models) table is created above, each row presents the statistics of one week in `ride.Trest.weekNest` predicted by a regression model.

# 5 Examing accuracy

In this section, Mean Absolute Error(MAE) is calculated on `ride.Test` for each model.

## 5.1 Examine Error Metrics for Accuracy

Figure 5-1 shows goodness of fit by week and model. The MAE for the time effects model (`reg1`) in week 21 is comparable to the mean observed `Trip_Count` of 0.256. With increasing sophistication, the model becomes more accurate and more generalizable.

```{r plot_errors_by_model }
week_predictions %>%
  dplyr::select(week, Regression, MAE) %>%
  gather(Variable, MAE, -Regression, -week) %>%
  ggplot(aes(week, MAE)) + 
    geom_bar(aes(fill = Regression), position = "dodge", stat="identity") +
    scale_fill_manual(values = palette5) +
    labs(title = "Mean Absolute Errors by model specification and week",
         caption = "Figure 5-1") +
  plotTheme
```

For each model, predicted and observed `Trip_Count` is taken out of the spatial context and their means of plotted in time series form below. It seems that model A and C have the same time trend because the spatial context has been removed.

The time lags facilitate the capacity to predict for the highest peaks as sophistication increases. 

```{r error_vs_actual_timeseries , warning = FALSE, message = FALSE}
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           start_station = map(data, pull, start_station)) %>%
    dplyr::select(interval60, start_station, Observed, Prediction, Regression) %>%
    unnest() %>%
    gather(Variable, Value, -Regression, -interval60, -start_station) %>%
    group_by(Regression, Variable, interval60) %>%
    summarize(Value = sum(Value)) %>%
    ggplot(aes(interval60, Value, colour=Variable)) + 
      geom_line(size = 1.1) + 
      facet_wrap(~Regression, ncol=1) +
      labs(title = "Predicted/Observed bike share time series", subtitle = "LA; A test set of 2 weeks",  x = "Hour", y= "Station Trips",
         caption = "Figure 5-2") +
      plotTheme
```

Figure 5-3 illustrates the observed and predicted distribution for different times of day durig weekdays and weekends. It is clear that the models are under predicting in general.

```{r obs_pred_all, warning=FALSE, message = FALSE, cache=TRUE}
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           start_station = map(data, pull, start_station), 
           start_lan = map(data, pull, start_lan), 
           start_lon = map(data, pull, start_lon),
           dotw = map(data, pull, dotw)) %>%
    select(interval60, start_lan, start_lon, 
           start_station, Observed, Prediction, Regression,
           dotw) %>%
    unnest() %>%
  filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
  mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
         time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
  ggplot()+
  geom_point(aes(x= Observed, y = Prediction))+
    geom_smooth(aes(x= Observed, y= Prediction), method = "lm", se = FALSE, color = "red")+
    geom_abline(slope = 1, intercept = 0)+
  facet_grid(time_of_day~weekend)+
  labs(title="Observed vs Predicted",
       x="Observed trips", 
       y="Predicted trips",
         caption = "Figure 5-3")+
  plotTheme
```

## 5.2 Space-Time Error Evaluation

Figure 5-4 plots mean absolute errors on map by weekdays/weekends and time of a day.

```{r station_summary, warning=FALSE, message = FALSE }
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           start_station = map(data, pull, start_station), 
           start_lan = map(data, pull, start_lan), 
           start_lon = map(data, pull, start_lon),
           dotw = map(data, pull, dotw) ) %>%
    select(interval60, start_station, start_lon, 
           start_lan, Observed, Prediction, Regression,
           dotw) %>%
    unnest() %>%
  filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
  mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
         time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
  group_by(start_station, weekend, time_of_day, start_lon, start_lan) %>%
  summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
  ggplot(.)+
  geom_sf(data = LACensus, color = "grey", fill = "transparent")+
  geom_point(aes(x = start_lon, y = start_lan, color = MAE), 
             fill = "transparent", size = 0.5, alpha = 0.4)+
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "D")+
  ylim(min(dat_census$start_lan), max(dat_census$start_lan))+
  xlim(min(dat_census$start_lon), max(dat_census$start_lon))+
  facet_grid(weekend~time_of_day)+
  labs(title="Mean Absolute Errors, Test Set",
         caption = "Figure 5-4")+
  mapTheme
  
```

## 5.3 Socio-economic variables

One might think that the station locations probably relate to likely users, who seem to be commuting downtown to the loop. Figure 5-4 plots MAE by three socio-economic variables: median income, percent taking public transportation and percent of white. Several stations show resistance to the models - they have high income, low public transportation usage and more than a half of white population.

```{r station_summary2, warning=FALSE, message = FALSE }
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           start_station = map(data, pull, start_station), 
           start_lan = map(data, pull, start_lan), 
           start_lon = map(data, pull, start_lon),
           dotw = map(data, pull, dotw),
           Percent_Taking_Public_Trans = map(data, pull, Percent_Taking_Public_Trans),
           Med_Inc = map(data, pull, Med_Inc),
           Percent_White = map(data, pull, Percent_White)) %>%
    select(interval60, start_station, start_lon, 
           start_lan, Observed, Prediction, Regression,
           dotw, Percent_Taking_Public_Trans, Med_Inc, Percent_White) %>%
    unnest() %>%
  filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
  mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
         time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
  filter(time_of_day == "AM Rush") %>%
  group_by(start_station, Percent_Taking_Public_Trans, Med_Inc, Percent_White) %>%
  summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
  gather(-start_station, -MAE, key = "variable", value = "value")%>%
  ggplot(.)+
  #geom_sf(data = LACensus, color = "grey", fill = "transparent")+
  geom_point(aes(x = value, y = MAE), alpha = 0.4)+
  geom_smooth(aes(x = value, y = MAE), method = "lm", se= FALSE)+
  facet_wrap(~variable, scales = "free")+
  labs(title="Errors as a function of socio-economic variables",
       y="Mean Absolute Error (Trips)",
         caption = "Figure 5-4")+
  plotTheme
  
```

# 6 Cross-Validation

Below, a parameter called `fitControl` is set to specify the number of k-fold prtitions - in this case 100. In the code below, `set.seed` ensures reproducible folds. An object `reg.cv`, is estimated using the same regression as specified in `reg 4`.

```{r cv}
fitControl <- trainControl(method = "cv", number = 100)
set.seed(8888)

reg.cv <- 
  train(Trip_Count ~  start_station +  hour(interval60) + dotw + Temperature + Precipitation +
                   lagHour + lag2Hours +lag3Hours + lag12Hours + lag1day, 
     data=ride.panel, 
     method = "lm", trControl = fitControl, na.action = na.pass)

reg.cv
```

The cross-validation output provides very important goodness of fit information. The `resample` shows the first five outputs of 100 folds.

```{r}
reg.cv$resample[1:5,]
```

```{r plot_cv, results='hide', message=FALSE , warning=FALSE}
MAE <- reg.cv$resample[,3]
df <- data.frame(MAE)

ggplot(df, aes(x=MAE)) +
  geom_histogram(aes(y=..density..),bins=35, fill="#FB8C6F") +
  labs(
    x="Cross-Validation MAE Histogram",
    y="MAE",
    caption = "Figure 6-1"
  )
plotTheme
```

# 7 Conclusion

I began by exploring serial autocorrelation and spatial autocorrelation of the LA Metro Bike Share trip data. Bike share counts Data by station shows certain time patterns during one week or one day: bikes are highly used on weekends and less on Wednesdays and Thursdays, 12:00-17:00 every day is the peak usage period. Bike share counts also shows spatial agglomeration, that is certain stations tend to have higher bike usages.

Then, five regression models are built. Divide the first three weeks as the training set, and the next two weeks as the test set. By calculating the prediction and error data of the model in the test set, the accuracy can be examined that the errors are relatively small. 

